# Setup for the parallel tokenisation script: load packages, source the
# project helpers, discover the corpus files and start the parallel backend.
#
# NOTE: the former `rm(list = ls())` was removed on purpose. It silently wipes
# the caller's workspace when this script is sourced interactively and does
# not provide a clean session anyway (attached packages and options persist).
# Run the script in a fresh R process (e.g. via Rscript) instead.

# library() fails immediately if a package is missing, whereas require() only
# warns and returns FALSE, postponing the failure to the first use.
library(quanteda)
library(stringi)
library(furrr)
library(future)
library(dplyr)

# Project helpers. Presumably defines tokenize_arabic(), `dict` and `month`,
# which the processing step relies on but this script never defines
# -- TODO confirm.
source("functions.R")

corpus_dir <- "data/"

# Inputs are named `corpus_<country>.RDS`; the leading `^` keeps files that
# merely contain "corpus_" somewhere in their name out of the batch.
corpus_files <- list.files(
  corpus_dir,
  pattern = "^corpus_.*\\.RDS$",
  full.names = TRUE
)

if (length(corpus_files) == 0L) {
  warning("No corpus files found in '", corpus_dir, "'; nothing to process.")
}

# Use at most 32 workers, but never more than the cores available to this
# process or the number of files to process (each extra multisession worker
# is a full R process that would otherwise sit idle or oversubscribe the CPU).
n_workers <- min(32L, availableCores(), max(1L, length(corpus_files)))
plan(multisession, workers = n_workers)


# Tokenise each corpus file on a parallel worker and write the result to disk.
#
# For every `corpus_<country>.RDS` path in `corpus_files` the worker:
#   1. reads the serialized corpus,
#   2. tokenises it with tokenize_arabic(),
#   3. compounds the sequences matched by `dict`, joined with a space,
#   4. removes `dict$stopwords`, keeping padding where tokens were dropped,
#   5. removes tokens matching the `month` regular expressions,
#   6. saves the tokens as `tokens_<country>.RDS` in the input's directory.
#
# tokenize_arabic(), `dict` and `month` are not defined in this script;
# presumably they come from functions.R -- TODO confirm.
#
# future_walk() replaces future_map() because the function is called purely
# for its side effects: no list of NULLs is collected from the workers, and
# nothing is auto-printed when the script runs at top level.
future_walk(corpus_files, function(file) {
  country <- gsub("corpus_(.*)\\.RDS", "\\1", basename(file))
  message("Processing: ", country)

  corp <- readRDS(file)

  toks <- tokenize_arabic(corp)
  toks <- tokens_compound(toks, dict, concatenator = " ")
  toks <- tokens_remove(toks, dict$stopwords, padding = TRUE)
  toks <- tokens_remove(toks, month, valuetype = "regex")

  # Write next to the input instead of a hard-coded "data/" so the output
  # location follows the corpus directory if that ever changes.
  out_file <- file.path(dirname(file), paste0("tokens_", country, ".RDS"))
  saveRDS(toks, out_file)

  # A worker may process several files in sequence; drop the large objects
  # and trigger a collection before it moves on to the next one.
  rm(corp, toks)
  gc()

  invisible(NULL)
}, .options = furrr_options(seed = TRUE))

